#!/bin/bash

source /root/.wasmedge/env

export wasm="llama-api-server.wasm"
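
# The launch command below relies on four environment variables. They are
# assumed to be exported by the caller (their names are taken from the
# command itself); this guard just fails fast with a clear message instead
# of passing empty strings to wasmedge:
#   LLMDATA         directory holding the GGUF model file
#   MODEL_FILE      GGUF model filename (e.g. Llama-2-7b-chat-hf-Q5_K_M.gguf)
#   PROMPT_TEMPLATE prompt template name (e.g. llama-2-chat)
#   MODEL_NAME      model name to advertise through the API
for var in LLMDATA MODEL_FILE PROMPT_TEMPLATE MODEL_NAME; do
  if [ -z "${!var:-}" ]; then
    echo "error: environment variable ${var} is not set" >&2
    exit 1
  fi
done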

# Launch the OpenAI-compatible LLM API server. -p sets the prompt template,
# -c the context size, -b the batch size, and -n the number of tokens to
# predict per request.
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:"${LLMDATA}/${MODEL_FILE}" "${wasm}" \
  -p "${PROMPT_TEMPLATE}" \
  -c 8192 -b 4096 -n 4096 \
  --socket-addr 0.0.0.0:8080 \
  --log-prompts --log-stat \
  --model-name "${MODEL_NAME}"
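
# Once the server is up (it runs in the foreground), the OpenAI-compatible
# chat endpoint can be smoke-tested from another shell. A minimal example,
# assuming the defaults above; <MODEL_NAME> must match the value passed to
# --model-name:
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "<MODEL_NAME>", "messages": [{"role": "user", "content": "Hello"}]}'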

# Alternative: RAG API server that pairs a chat model with an embedding
# model (two --nn-preload entries; per-model values are comma-separated).
# export rag="rag-api-server.wasm"
# wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \
# --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \
# rag-api-server.wasm \
# --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \
# --ctx-size 4096,384 \
# --prompt-template llama-2-chat \
# --rag-prompt "Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n" \
# --log-prompts \
# --log-stat
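
# The RAG variant additionally serves an OpenAI-style embeddings endpoint.
# A rough smoke test from another shell (a sketch, assuming the server keeps
# its default 8080 listen address):
#
#   curl http://localhost:8080/v1/embeddings \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "all-MiniLM-L6-v2-ggml-model-f16", "input": ["hello world"]}'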

# Alternative: a single llama-api-server instance serving two models (chat
# plus embeddings). The comma-separated option values pair up positionally
# with the two --nn-preload entries (default, embedding).
# wasmedge --dir .:. \
#     --nn-preload default:GGML:AUTO:Qwen2.5-14B-instruct-Q5_K_M.gguf \
#     --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5.f16.gguf \
#     llama-api-server.wasm \
#     --model-alias default,embedding \
#     --model-name Qwen2.5-14B-instruct,nomic-embed \
#     --prompt-template chatml,embedding \
#     --batch-size 128,8192 \
#     --ctx-size 4096,8192
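
# With two models loaded, requests are routed by the "model" field in the
# request body; e.g. an embeddings request against the second model (a
# sketch, again assuming the default 8080 listen address):
#
#   curl http://localhost:8080/v1/embeddings \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "nomic-embed", "input": ["hello world"]}'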
